{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 6.1\t基本存储：存储至txt或csv"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6.1.1把数据存储至txt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "title = \"This is a test sentence.\"\n",
    "with open('title.txt', \"a+\") as f:\n",
    "    f.write(title)\n",
    "    f.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "title = \"This is a test sentence.\"\n",
    "with open(r'C:\\Users\\Administrator\\Desktop\\title.txt', \"a+\") as f:\n",
    "    f.write(title)\n",
    "    f.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "output = '\\t'.join(['name','title','age','gender'])\n",
    "with open('C:\\\\Users\\\\Administrator\\\\desktop\\\\test.txt', \"a+\") as f:\n",
    "    f.write(output)\n",
    "    f.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "This is a test sentence.\n"
     ]
    }
   ],
   "source": [
    "with open('title.txt', \"r\", encoding ='utf-8') as f:\n",
    "    result = f.read()\n",
    "    print (result)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6.1.2把数据存储至csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['A1', 'B1', 'C1', 'D1']\n",
      "A1\n",
      "['A2', 'B2', 'C2', 'D2']\n",
      "A2\n",
      "['A3', 'B3', 'C3', 'D3']\n",
      "A3\n",
      "['A4', 'B4', 'C4', 'D4']\n",
      "A4\n"
     ]
    }
   ],
   "source": [
    "import csv\n",
    "with open('test.csv', 'r',encoding='utf-8') as csvfile:\n",
    "    csv_reader = csv.reader(csvfile)\n",
    "    for row in csv_reader:\n",
    "        print(row)\n",
    "        print(row[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import csv\n",
    "output_list = ['1', '2','3','4']\n",
    "with open('test2.csv', 'a+', encoding='utf-8', newline='') as csvfile:\n",
    "    w = csv.writer(csvfile)\n",
    "    w.writerow(output_list)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6.2 储存至MySQL数据库"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6.2.3 Python操作MySQL数据库"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "#coding=utf-8\n",
    "import MySQLdb\n",
    "\n",
    "conn= MySQLdb.connect(host='localhost' , user='root', passwd='root', db ='scraping')\n",
    "cur = conn.cursor()\n",
    "\n",
    "cur.execute(\"INSERT INTO urls (url, content) VALUES ('www.baidu.com', 'This is content.')\")\n",
    "cur.close()\n",
    "conn.commit()\n",
    "conn.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "ename": "UnicodeEncodeError",
     "evalue": "'latin-1' codec can't encode characters in position 7-10: ordinal not in range(256)",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mUnicodeEncodeError\u001b[0m                        Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-6-dcdae84a66a5>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m     15\u001b[0m     \u001b[0murl\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0meachone\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0ma\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'href'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m     16\u001b[0m     \u001b[0mtitle\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0meachone\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0ma\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtext\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstrip\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m---> 17\u001b[0;31m     \u001b[0mcur\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexecute\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"INSERT INTO urls (url, content) VALUES (%s, %s)\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m(\u001b[0m\u001b[0murl\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtitle\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     18\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m     19\u001b[0m \u001b[0mcur\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mclose\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[0;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\MySQLdb\\cursors.py\u001b[0m in \u001b[0;36mexecute\u001b[0;34m(self, query, args)\u001b[0m\n\u001b[1;32m    232\u001b[0m                 \u001b[0margs\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mdict\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdb\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mliteral\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mitem\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mkey\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mitem\u001b[0m \u001b[1;32min\u001b[0m \u001b[0margs\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mitems\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m    233\u001b[0m             \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m--> 234\u001b[0;31m                 \u001b[0margs\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mtuple\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmap\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdb\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mliteral\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0margs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    235\u001b[0m             \u001b[1;32mif\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0mPY2\u001b[0m \u001b[1;32mand\u001b[0m \u001b[0misinstance\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mquery\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m(\u001b[0m\u001b[0mbytes\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mbytearray\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m    236\u001b[0m                 \u001b[0mquery\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mquery\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdecode\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdb\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0municode_literal\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcharset\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[0;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\MySQLdb\\connections.py\u001b[0m in \u001b[0;36mliteral\u001b[0;34m(self, o)\u001b[0m\n\u001b[1;32m    314\u001b[0m         \u001b[0mapplications\u001b[0m\u001b[1;33m.\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m    315\u001b[0m         \"\"\"\n\u001b[0;32m--> 316\u001b[0;31m         \u001b[0ms\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mescape\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mo\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mencoders\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    317\u001b[0m         \u001b[1;31m# Python 3(~3.4) doesn't support % operation for bytes object.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m    318\u001b[0m         \u001b[1;31m# We should decode it before using %.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[0;32mC:\\ProgramData\\Anaconda3\\lib\\site-packages\\MySQLdb\\connections.py\u001b[0m in \u001b[0;36municode_literal\u001b[0;34m(u, dummy)\u001b[0m\n\u001b[1;32m    224\u001b[0m                 \u001b[1;31m# unicode_literal() is called for arbitrary object.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m    225\u001b[0m                 \u001b[1;32mdef\u001b[0m \u001b[0municode_literal\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mu\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdummy\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m--> 226\u001b[0;31m                     \u001b[1;32mreturn\u001b[0m \u001b[0mdb\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstring_literal\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mstr\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mu\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mencode\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0municode_literal\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcharset\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    227\u001b[0m             \u001b[1;32mreturn\u001b[0m \u001b[0municode_literal\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m    228\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mUnicodeEncodeError\u001b[0m: 'latin-1' codec can't encode characters in position 7-10: ordinal not in range(256)"
     ]
    }
   ],
   "source": [
    "import requests\n",
    "from bs4 import BeautifulSoup\n",
    "import MySQLdb\n",
    "\n",
    "conn= MySQLdb.connect(host='localhost' , user='root', passwd='root',db ='scraping')\n",
    "cur = conn.cursor()\n",
    "\n",
    "link = \"http://www.santostang.com/\"\n",
    "headers = {'User-Agent' : 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'} \n",
    "r = requests.get(link, headers= headers)\n",
    "\n",
    "soup = BeautifulSoup(r.text, \"lxml\")\n",
    "title_list = soup.find_all(\"h1\", class_=\"post-title\")\n",
    "for eachone in title_list:\n",
    "    url = eachone.a['href']\n",
    "    title = eachone.a.text.strip()\n",
    "    cur.execute(\"INSERT INTO urls (url, content) VALUES (%s, %s)\", (url, title))\n",
    "    \n",
    "cur.close()\n",
    "conn.commit()\n",
    "conn.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import requests\n",
    "from bs4 import BeautifulSoup\n",
    "import MySQLdb\n",
    "\n",
    "conn = MySQLdb.connect(host='localhost', user='root', passwd='root', db = 'scraping', charset=\"utf8\")\n",
    "cur = conn.cursor()\n",
    "\n",
    "link = \"http://www.santostang.com/\"\n",
    "headers = {'User-Agent' : 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'} \n",
    "r = requests.get(link, headers= headers)\n",
    "\n",
    "soup = BeautifulSoup(r.text, \"lxml\")\n",
    "title_list = soup.find_all(\"h1\", class_=\"post-title\")\n",
    "for eachone in title_list:\n",
    "    url = eachone.a['href']\n",
    "    title = eachone.a.text.strip()\n",
    "    cur.execute(\"INSERT INTO urls (url, content) VALUES (%s, %s)\", (url, title))\n",
    "    \n",
    "cur.close()\n",
    "conn.commit()\n",
    "conn.close()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 6.3 储存至MongoDB数据库"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 6.3.3 Python操作MongoDB数据库"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from pymongo import MongoClient\n",
    "client = MongoClient('localhost',27017)\n",
    "db = client.blog_database\n",
    "collection = db.blog"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import requests\n",
    "import datetime\n",
    "from bs4 import BeautifulSoup\n",
    "from pymongo import MongoClient\n",
    "\n",
    "client = MongoClient('localhost',27017)\n",
    "db = client.blog_database\n",
    "collection = db.blog\n",
    "\n",
    "link = \"http://www.santostang.com/\"\n",
    "headers = {'User-Agent' : 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'} \n",
    "r = requests.get(link, headers= headers)\n",
    "\n",
    "soup = BeautifulSoup(r.text, \"lxml\")\n",
    "title_list = soup.find_all(\"h1\", class_=\"post-title\")\n",
    "for eachone in title_list:\n",
    "    url = eachone.a['href']\n",
    "    title = eachone.a.text.strip()\n",
    "    post = {\"url\": url,\n",
    "         \"title\": title,\n",
    "         \"date\": datetime.datetime.utcnow()}\n",
    "    collection.insert_one(post)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
